In [12]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# for scaling the data
from sklearn.preprocessing import StandardScaler 

# for clustering
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_samples, silhouette_score

# Set display options to show all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Set the ggplot style 
plt.style.use('ggplot')

# Load the dataset
Data = pd.read_csv('Documents/Mastry_Project2/FinalSQL.csv')
In [13]:
# Excluding user_id column as it doesn't make sense to get its statistical summary
selected_columns = cohort_users.columns.difference(['user_id'])

# Generate the statistical summary for selected columns
cohort_users[selected_columns].describe().T
Out[13]:
count mean std min 25% 50% 75% max
age 5998.0 41.014672 12.046715 16.00 34.0000 41.000 48.0000 88.00
avg_booking_departure_gap_days_flights 5206.0 21.818417 48.355668 2.35 6.6100 7.645 9.0200 365.08
avg_booking_departure_gap_days_hotels 5435.0 12.981943 27.731505 1.50 6.5800 7.730 9.0500 329.83
avg_checked_bags 5206.0 0.603338 0.512678 0.00 0.2500 0.500 1.0000 6.60
avg_distance_flown_km 5206.0 2348.176104 1749.255961 27.52 1476.5125 2000.040 2696.1900 15954.76
avg_flight_discount_amount 4721.0 0.141010 0.070932 0.05 0.1000 0.130 0.1800 0.55
avg_flight_price_usd 5206.0 550.205344 739.590156 5.35 282.7750 392.795 565.9625 14280.38
avg_flight_seats 5206.0 1.222578 0.454272 1.00 1.0000 1.000 1.3300 6.60
avg_hotel_discount_amount 4063.0 0.112437 0.054025 0.05 0.0800 0.100 0.1500 0.45
avg_hotel_price_usd 5435.0 178.272981 84.673031 24.00 124.3300 163.000 211.8350 1063.00
avg_hotel_rooms 5435.0 1.198267 0.352250 1.00 1.0000 1.000 1.3300 4.00
avg_page_clicks 5998.0 18.782508 11.969869 4.13 12.5600 15.670 19.8150 163.90
avg_session_duration_minute 5998.0 1.780123 1.602640 0.00 1.1000 1.440 1.8900 16.40
avg_stay_duration_day 5435.0 3.804942 3.007981 -1.00 2.0000 3.000 5.0000 30.00
cancellation_proportion 5566.0 0.047528 0.160093 0.00 0.0000 0.000 0.0000 1.00
conversion_rate 5998.0 0.333846 0.185690 0.00 0.2200 0.330 0.5000 1.00
discounted_flight_proportion 5998.0 0.187049 0.136935 0.00 0.1100 0.130 0.2500 0.70
discounted_hotel_proportion 5998.0 0.145782 0.126332 0.00 0.0000 0.130 0.2500 0.75
round_trips_proportion 5998.0 0.827904 0.351102 0.00 1.0000 1.000 1.0000 1.00
session_count 5998.0 8.427309 0.790536 8.00 8.0000 8.000 9.0000 14.00
total_flights_booked 5998.0 2.397966 1.577156 0.00 1.0000 2.000 3.0000 9.00
total_hotels_booked 5998.0 2.527509 1.511832 0.00 1.0000 2.000 4.0000 8.00
total_trips 5998.0 2.805435 1.589629 0.00 2.0000 3.000 4.0000 9.00
weekend_trip_proportion 5998.0 0.049298 0.137947 0.00 0.0000 0.000 0.0000 1.00
In [14]:
# flagging users based on flight bookings:True if the user has booked a flight and False if hasn't 
cohort_users['has_booked'] = ~cohort_users['cancellation_proportion'].isna()

# filling cancellation_proportion column null values with 0
cohort_users['cancellation_proportion'].fillna(0, inplace=True)
In [20]:
# Determine the position to insert the new column after the last_min_booking columns
position = cohort_users.columns.get_loc('avg_booking_departure_gap_days_hotels') + 1

# Insert the new column which is the maximum of the both columns at the desired position
cohort_users.insert(position, 'avg_booking_departure_gap_days', 
            cohort_users[['avg_booking_departure_gap_days_flights', 'avg_booking_departure_gap_days_hotels']].max(axis=1))

# Drop the original columns
cohort_users.drop(columns=['avg_booking_departure_gap_days_flights', 'avg_booking_departure_gap_days_hotels'], inplace=True)
In [21]:
# check the number of null values of the new column 'avg_booking_departure_gap_days'
null_values = cohort_users['avg_booking_departure_gap_days'].isna().sum()
print(f'We have {null_values} null values in avg_booking_departure_gap_days column.')
We have 432 null values in avg_booking_departure_gap_days column.
In [22]:
max_gap = cohort_users['avg_booking_departure_gap_days'].max()
cohort_users['avg_booking_departure_gap_days'].fillna(max_gap, inplace=True)
In [23]:
# Flag users who haven't booked a flight
cohort_users['has_booked_flight'] = ~cohort_users['avg_flight_price_usd'].isna()

# Columns to fill with 0
zero_fill_columns = ['avg_flight_discount_amount', 'avg_flight_seats', 'avg_checked_bags']

# Columns to fill with their medians
median_fill_columns = ['avg_flight_price_usd', 'avg_distance_flown_km']

# filling null values with zero
for col in zero_fill_columns:
    cohort_users[col].fillna(0, inplace=True)

# filling null values with median
for col in median_fill_columns:
    cohort_users[col].fillna(cohort_users[col].median(), inplace=True)
In [24]:
# Flag users who haven't booked a hotel
cohort_users['has_booked_hotel'] = ~cohort_users['avg_hotel_price_usd'].isna()

# Columns to fill with 0
zero_fill_columns = ['avg_hotel_discount_amount', 'avg_hotel_rooms', 'avg_stay_duration_day']

# Columns to fill with their medians
median_fill_columns = ['avg_hotel_price_usd']

# filling null values with zero
for col in zero_fill_columns:
    cohort_users[col].fillna(0, inplace=True)

# filling null values with median
for col in median_fill_columns:
    cohort_users[col].fillna(cohort_users[col].median(), inplace=True)

# checking the number of null values
null_values = cohort_users.isna().sum().sum()
print(f'We have {null_values} Null values in the dataset.')
We have 0 Null values in the dataset.
In [25]:
negative_numbers = cohort_users[cohort_users['avg_stay_duration_day'] < 0].shape[0]
percentage = round(cohort_users[cohort_users['avg_stay_duration_day'] < 0].shape[0]/
                   cohort_users.avg_stay_duration_day.count()*100,2)
print(f'Only {negative_numbers} records ({percentage}%) of avg_stay_duration_day column have negative amounts.')
Only 2 records (0.03%) of avg_stay_duration_day column have negative amounts.
In [26]:
# replacing the records with 0
cohort_users.loc[cohort_users['avg_stay_duration_day'] < 0, 'avg_stay_duration_day'] = 0

negative_numbers = cohort_users[cohort_users['avg_stay_duration_day'] < 0].shape[0]

# checking how many records we have with this criteria
print(f'Now {negative_numbers} records of the avg_stay_duration column is negative.')
Now 0 records of the avg_stay_duration column is negative.
In [81]:
# Select only numerical columns and excluding the user_id column
numerical_columns = [col for col in cohort_users.select_dtypes(include=['float64', 'int64']).columns if col != 'user_id']


# Set up the figure and axes
fig, axes = plt.subplots(len(numerical_columns), 2, figsize=(15, 4 * len(numerical_columns)))

for i, col in enumerate(numerical_columns):
    # Compute the outlier bounds: I commented out the lower_bound as in all the columns it's out of the range
    #lower_bound = cohort_users[col].mean() - 5 * cohort_users[col].std()
    upper_bound = cohort_users[col].mean() + 5 * cohort_users[col].std()
    mean = cohort_users[col].mean()
    
    # Plot histogram
    bins = int(np.log2(len(cohort_users[col])) + 1)  # Sturges' formula
    sns.histplot(cohort_users[col], ax=axes[i, 0], bins=bins, kde=True, edgecolor='black')
    axes[i, 0].set_title(f'Histogram of {col}')
    #axes[i, 0].axvline(lower_bound, color='r', linestyle='--', label=f'-5*STD')
    axes[i, 0].axvline(upper_bound, color='r', linestyle='--', label=f'5*STD')
    axes[i, 0].axvline(mean, color='r', linestyle='--', label='Mean')
    axes[i, 0].set_xlabel(col)
    axes[i, 0].set_ylabel('Frequency')
    axes[i, 0].legend()
    
    # Plot box plot
    sns.boxplot(x=cohort_users[col], ax=axes[i, 1], showfliers=True, boxprops=dict(alpha=0.7))
    axes[i, 1].axvline(upper_bound, color='r', linestyle='--', label=f'5*STD')  # Draw the 5*STD line on boxplot
    axes[i, 1].set_title(f'Boxplot of {col}')
    axes[i, 1].legend()

# Adjust layout
plt.tight_layout()
plt.show()
In [ ]:
 
In [29]:
# we had 5998 records
number_of_outliers = 5998 - cohort_users.shape[0]
percentage = round(number_of_outliers/5998 * 100,2)
print(f'{number_of_outliers} ({percentage}%) records were outliers!')
329 (5.49%) records were outliers!
In [30]:
# Calculating the average_dollar_saved_per_kilometre metric 
avg_usd_saved_per_km = round(cohort_users['avg_flight_discount_amount'] 
                                                    * cohort_users['avg_flight_price_usd'] 
                                                    / cohort_users['avg_distance_flown_km'],2)

# Determine the position to insert the new column after the last_min_booking columns
position = cohort_users.columns.get_loc('avg_distance_flown_km') + 1

# Insert the new column which is the maximum of the both columns at the desired position
cohort_users.insert(position, 'avg_usd_saved_per_km', avg_usd_saved_per_km)
In [31]:
# List of columns to scale 
cols_to_scale = ['married', 'has_children', 'age', 'session_count', 'avg_session_duration_minute', 'avg_page_clicks', 
                 'total_trips', 'conversion_rate', 'weekend_trip_proportion', 'cancellation_proportion', 
                 'avg_booking_departure_gap_days', 'total_flights_booked', 'round_trips_proportion', 
                 'avg_flight_price_usd', 'avg_flight_discount_amount', 'discounted_flight_proportion', 'avg_flight_seats', 
                 'avg_checked_bags', 'avg_distance_flown_km', 'avg_usd_saved_per_km', 'total_hotels_booked', 
                 'avg_hotel_price_usd', 'avg_hotel_discount_amount', 'discounted_hotel_proportion', 
                 'avg_hotel_rooms', 'avg_stay_duration_day']

# Initialize the StandardScaler
scaler = StandardScaler()
    
# Apply scaling and create new columns with suffix "_scaled"
scaled_data = scaler.fit_transform(cohort_users[cols_to_scale])
scaled_columns = ["scaled_" + col for col in cols_to_scale]
cohort_users[scaled_columns] = scaled_data.round(2)

# check the dataset 
cohort_users.head()
Out[31]:
user_id sign_up_date age gender married has_children home_country home_city session_count avg_session_duration_minute avg_page_clicks total_trips conversion_rate weekend_trip_proportion cancellation_proportion avg_booking_departure_gap_days total_flights_booked round_trips_proportion avg_flight_price_usd avg_flight_discount_amount discounted_flight_proportion avg_flight_seats avg_checked_bags avg_distance_flown_km avg_usd_saved_per_km total_hotels_booked avg_hotel_price_usd avg_hotel_discount_amount discounted_hotel_proportion avg_hotel_rooms avg_stay_duration_day has_booked has_booked_flight has_booked_hotel scaled_married scaled_has_children scaled_age scaled_session_count scaled_avg_session_duration_minute scaled_avg_page_clicks scaled_total_trips scaled_conversion_rate scaled_weekend_trip_proportion scaled_cancellation_proportion scaled_avg_booking_departure_gap_days scaled_total_flights_booked scaled_round_trips_proportion scaled_avg_flight_price_usd scaled_avg_flight_discount_amount scaled_discounted_flight_proportion scaled_avg_flight_seats scaled_avg_checked_bags scaled_avg_distance_flown_km scaled_avg_usd_saved_per_km scaled_total_hotels_booked scaled_avg_hotel_price_usd scaled_avg_hotel_discount_amount scaled_discounted_hotel_proportion scaled_avg_hotel_rooms scaled_avg_stay_duration_day
0 23557 2021-07-22 64 F True False usa new york 12 6.93 17.50 4 0.33 0.00 0.25 248.05 2 1.0 518.29 0.15 0.21 1.00 1.00 2680.73 0.03 4 157.33 0.18 0.29 1.33 7.33 True True True 1.13 -0.69 1.93 4.81 4.98 -0.04 0.73 -0.05 -0.38 2.59 2.07 -0.27 0.5 0.15 0.46 0.21 -0.06 1.10 0.45 0.31 0.95 -0.24 1.52 1.22 0.52 1.44
1 94883 2022-02-07 51 F True False usa kansas city 12 0.50 8.33 3 0.25 0.00 0.00 60.98 3 1.0 1784.95 0.10 0.08 1.67 1.00 5057.35 0.04 2 90.00 0.08 0.17 1.50 0.50 True True True 1.13 -0.69 0.83 4.81 -1.06 -0.92 0.10 -0.48 -0.38 -0.26 0.18 0.36 0.5 3.62 -0.13 -0.75 1.23 1.10 2.51 0.79 -0.37 -1.16 0.06 0.24 0.89 -1.03
3 101961 2022-02-17 43 F True False usa boston 12 2.00 18.17 7 0.58 0.14 0.00 7.34 6 1.0 321.53 0.13 0.25 1.00 0.50 1753.36 0.02 7 150.29 0.10 0.08 1.00 3.14 True True True 1.13 -0.69 0.15 4.81 0.35 0.03 2.61 1.29 0.82 -0.26 -0.36 2.26 0.5 -0.39 0.22 0.51 -0.06 -0.00 -0.35 -0.17 2.94 -0.34 0.35 -0.49 -0.19 -0.08
9 149058 2022-04-14 50 F False True usa birmingham 12 2.79 15.36 6 0.50 0.00 0.17 58.79 6 1.0 664.40 0.05 0.21 1.56 0.67 2304.81 0.01 6 165.67 0.00 0.14 1.11 8.56 True True True -0.89 1.45 0.75 4.81 1.09 -0.24 1.99 0.86 -0.38 1.68 0.16 2.26 0.5 0.55 -0.72 0.21 1.01 0.37 0.13 -0.65 2.28 -0.13 -1.11 0.00 0.05 1.88
10 152583 2022-04-17 37 F False False usa colorado springs 10 6.17 21.25 2 0.20 0.00 0.50 181.16 2 1.0 3769.43 0.15 0.25 1.80 1.20 10253.52 0.06 2 213.00 0.15 0.42 1.00 2.00 True True True -0.89 -0.69 -0.35 2.13 4.27 0.33 -0.53 -0.75 -0.38 5.43 1.40 -0.27 0.5 9.06 0.46 0.51 1.47 1.54 6.99 1.76 -0.37 0.52 1.08 2.27 -0.19 -0.49
In [57]:
# Extract the relevant columns for clustering
features = cohort_users[[
                  # Free hotel meal:
                  'scaled_has_children',
                  'scaled_age',
                  'scaled_avg_flight_seats', 
                  'scaled_avg_hotel_rooms',
                  # Free checked bag:
                  'scaled_avg_checked_bags',
                  'scaled_avg_stay_duration_day',
                  'scaled_avg_distance_flown_km',
                  'scaled_round_trips_proportion',
                  # No cancellation fees:
                  'scaled_cancellation_proportion',
                  'scaled_avg_booking_departure_gap_days',
                  # Exclusive discounts:
                  'scaled_conversion_rate',
                  'scaled_avg_page_clicks',
                  'scaled_avg_session_duration_minute',
                  'scaled_avg_flight_discount_amount',
                  'scaled_discounted_flight_proportion',
                  'scaled_avg_usd_saved_per_km',
                  'scaled_avg_hotel_discount_amount',
                  'scaled_discounted_hotel_proportion',  
                  # 1-night free hotel with a flight:
                  'scaled_total_flights_booked',
                  'scaled_total_hotels_booked',
                  'scaled_weekend_trip_proportion',
                  'scaled_total_trips'
                        ]]

# Compute the correlation matrix for the free_meal_index
correlation_matrix = features.corr()

# Set up the matplotlib figure
plt.figure(figsize=(15, 10))

# Generate a heatmap for the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1, linewidths=0.5, annot_kws={"size": 8})


# Adjust the x-axis labels for better readability
plt.xticks(rotation=90, ha='right')

# Set the title and show the plot
plt.title("Features Correlation Matrix Heatmap")
plt.tight_layout()
plt.show()
In [33]:
# Extract the relevant columns for clustering
features = cohort_users[[
                  # Free hotel meal:
                  #'scaled_has_children',
                  #'scaled_age',
                  #'scaled_avg_flight_seats', 
                  'scaled_avg_hotel_rooms',
                  # Free checked bag:
                  'scaled_avg_checked_bags',
                  #'scaled_avg_stay_duration_day',
                  #'scaled_avg_distance_flown_km',
                  #'scaled_round_trips_proportion',
                  # No cancellation fees:
                  'scaled_cancellation_proportion',
                  #'scaled_avg_booking_departure_gap_days',
                  # Exclusive discounts:
                  'scaled_conversion_rate',
                  #'scaled_avg_page_clicks',
                  #'scaled_avg_session_duration_minute',
                  #'scaled_avg_flight_discount_amount',
                  #'scaled_discounted_flight_proportion',
                  #'scaled_avg_usd_saved_per_km',
                  #'scaled_avg_hotel_discount_amount',
                  #'scaled_discounted_hotel_proportion',  
                  # 1-night free hotel with a flight:
                  #'scaled_total_flights_booked',
                  #'scaled_total_hotels_booked',
                  'scaled_weekend_trip_proportion',
                  #'scaled_total_trips'
                        ]]

# Display the first few rows of the selected features
features.head()
Out[33]:
scaled_avg_hotel_rooms scaled_avg_checked_bags scaled_cancellation_proportion scaled_conversion_rate scaled_weekend_trip_proportion
0 0.52 1.10 2.59 -0.05 -0.38
1 0.89 1.10 -0.26 -0.48 -0.38
3 -0.19 -0.00 -0.26 1.29 0.82
9 0.05 0.37 1.68 0.86 -0.38
10 -0.19 1.54 5.43 -0.75 -0.38
In [35]:
def get_silhouette_score(features, n_clusters):
    data_sample = features.sample(frac=1, random_state=1)
    model = KMeans(n_clusters=n_clusters, random_state=1)
    model.fit(data_sample)
    
    model_sil_score = silhouette_score(data_sample, model.labels_)
    
    print(f"The silhouette score using {n_clusters} number of clusters is: {round(model_sil_score, 3)}")
    
# calling the function for different number of clusters    
get_silhouette_score(features, 3)
get_silhouette_score(features, 4)
get_silhouette_score(features, 5)
get_silhouette_score(features, 6)
get_silhouette_score(features, 7)
The silhouette score using 3 number of clusters is: 0.314
The silhouette score using 4 number of clusters is: 0.367
The silhouette score using 5 number of clusters is: 0.351
The silhouette score using 6 number of clusters is: 0.378
The silhouette score using 7 number of clusters is: 0.394
In [36]:
# Retrieving the cluster labels assigned to each data point in our dataset.
cohort_users['cluster_label'] = model.labels_
model.labels_
Out[36]:
array([2, 0, 3, ..., 4, 4, 1], dtype=int32)
In [37]:
cohort_users.head()
Out[37]:
user_id sign_up_date age gender married has_children home_country home_city session_count avg_session_duration_minute avg_page_clicks total_trips conversion_rate weekend_trip_proportion cancellation_proportion avg_booking_departure_gap_days total_flights_booked round_trips_proportion avg_flight_price_usd avg_flight_discount_amount discounted_flight_proportion avg_flight_seats avg_checked_bags avg_distance_flown_km avg_usd_saved_per_km total_hotels_booked avg_hotel_price_usd avg_hotel_discount_amount discounted_hotel_proportion avg_hotel_rooms avg_stay_duration_day has_booked has_booked_flight has_booked_hotel scaled_married scaled_has_children scaled_age scaled_session_count scaled_avg_session_duration_minute scaled_avg_page_clicks scaled_total_trips scaled_conversion_rate scaled_weekend_trip_proportion scaled_cancellation_proportion scaled_avg_booking_departure_gap_days scaled_total_flights_booked scaled_round_trips_proportion scaled_avg_flight_price_usd scaled_avg_flight_discount_amount scaled_discounted_flight_proportion scaled_avg_flight_seats scaled_avg_checked_bags scaled_avg_distance_flown_km scaled_avg_usd_saved_per_km scaled_total_hotels_booked scaled_avg_hotel_price_usd scaled_avg_hotel_discount_amount scaled_discounted_hotel_proportion scaled_avg_hotel_rooms scaled_avg_stay_duration_day cluster_label
0 23557 2021-07-22 64 F True False usa new york 12 6.93 17.50 4 0.33 0.00 0.25 248.05 2 1.0 518.29 0.15 0.21 1.00 1.00 2680.73 0.03 4 157.33 0.18 0.29 1.33 7.33 True True True 1.13 -0.69 1.93 4.81 4.98 -0.04 0.73 -0.05 -0.38 2.59 2.07 -0.27 0.5 0.15 0.46 0.21 -0.06 1.10 0.45 0.31 0.95 -0.24 1.52 1.22 0.52 1.44 2
1 94883 2022-02-07 51 F True False usa kansas city 12 0.50 8.33 3 0.25 0.00 0.00 60.98 3 1.0 1784.95 0.10 0.08 1.67 1.00 5057.35 0.04 2 90.00 0.08 0.17 1.50 0.50 True True True 1.13 -0.69 0.83 4.81 -1.06 -0.92 0.10 -0.48 -0.38 -0.26 0.18 0.36 0.5 3.62 -0.13 -0.75 1.23 1.10 2.51 0.79 -0.37 -1.16 0.06 0.24 0.89 -1.03 0
3 101961 2022-02-17 43 F True False usa boston 12 2.00 18.17 7 0.58 0.14 0.00 7.34 6 1.0 321.53 0.13 0.25 1.00 0.50 1753.36 0.02 7 150.29 0.10 0.08 1.00 3.14 True True True 1.13 -0.69 0.15 4.81 0.35 0.03 2.61 1.29 0.82 -0.26 -0.36 2.26 0.5 -0.39 0.22 0.51 -0.06 -0.00 -0.35 -0.17 2.94 -0.34 0.35 -0.49 -0.19 -0.08 3
9 149058 2022-04-14 50 F False True usa birmingham 12 2.79 15.36 6 0.50 0.00 0.17 58.79 6 1.0 664.40 0.05 0.21 1.56 0.67 2304.81 0.01 6 165.67 0.00 0.14 1.11 8.56 True True True -0.89 1.45 0.75 4.81 1.09 -0.24 1.99 0.86 -0.38 1.68 0.16 2.26 0.5 0.55 -0.72 0.21 1.01 0.37 0.13 -0.65 2.28 -0.13 -1.11 0.00 0.05 1.88 2
10 152583 2022-04-17 37 F False False usa colorado springs 10 6.17 21.25 2 0.20 0.00 0.50 181.16 2 1.0 3769.43 0.15 0.25 1.80 1.20 10253.52 0.06 2 213.00 0.15 0.42 1.00 2.00 True True True -0.89 -0.69 -0.35 2.13 4.27 0.33 -0.53 -0.75 -0.38 5.43 1.40 -0.27 0.5 9.06 0.46 0.51 1.47 1.54 6.99 1.76 -0.37 0.52 1.08 2.27 -0.19 -0.49 2
In [38]:
# Constructing a DataFrame to represent the cluster centroids with respective feature values.
model_centroids = pd.DataFrame(model.cluster_centers_, columns=features.columns)

# Displaying the coordinates/values of each cluster centroid in a tabular format.
display(model_centroids)

# Plotting a heatmap to visually represent the position and magnitude of each cluster centroid in the feature space.
plt.figure(figsize=(10, 6))
sns.heatmap(model_centroids.T, cmap="Blues", annot=True)
plt.xlabel("Cluster centroids", fontsize=14)
plt.tight_layout()
plt.title("Centroid positions in high-dimensional space", fontsize=15)
scaled_avg_hotel_rooms scaled_avg_checked_bags scaled_cancellation_proportion scaled_conversion_rate scaled_weekend_trip_proportion
0 0.203140 0.718430 -0.252331 0.398117 -0.373519
1 -2.340000 -0.966199 -0.260000 -1.723496 -0.380000
2 0.171943 0.342979 3.462332 0.321010 0.004534
3 0.215061 0.108494 -0.238060 0.628589 2.301248
4 0.258524 -0.790872 -0.260000 -0.372912 -0.380000
Out[38]:
Text(0.5, 1.0, 'Centroid positions in high-dimensional space')
In [39]:
cluster_dict = {0:'Free Checked Bag',
               1: 'Exclusive Discounts',
               2: 'No Cancellation Fee',
               3: '1-night free hotel with a flight',
               4: 'Free Hotel Meal'} 

# Add a new column to the DataFrame to store the perk names
cohort_users['perk'] = cohort_users['cluster_label'].map(cluster_dict)

# Count the number of users in each segment
segment_counts = cohort_users['perk'].value_counts().reset_index()
segment_counts.columns = ['perk', 'user_count']

# Display the number of users in each segment
segment_counts
Out[39]:
perk user_count
0 Free Checked Bag 2299
1 Free Hotel Meal 1755
2 1-night free hotel with a flight 737
3 Exclusive Discounts 492
4 No Cancellation Fee 386
In [40]:
grouped_df = features.join(cohort_users['perk']).groupby('perk').mean()
grouped_df
Out[40]:
scaled_avg_hotel_rooms scaled_avg_checked_bags scaled_cancellation_proportion scaled_conversion_rate scaled_weekend_trip_proportion
perk
1-night free hotel with a flight 0.215061 0.108494 -0.238060 0.628589 2.301248
Exclusive Discounts -2.340000 -0.966199 -0.260000 -1.723496 -0.380000
Free Checked Bag 0.203140 0.718430 -0.252331 0.398117 -0.373519
Free Hotel Meal 0.258524 -0.790872 -0.260000 -0.372912 -0.380000
No Cancellation Fee 0.171943 0.342979 3.462332 0.321010 0.004534
In [41]:
filtered_columns = [col for col in grouped_df.columns if col != 'scaled_cancellation_proportion'] 

# Extract unique segment labels from the 'perk' column
segments = cohort_users['perk'].unique()

# Calculate the number of rows and columns for the subplot grid
n_rows = len(filtered_columns)  # One row per feature
n_cols = 2  # Two columns per row

# Create subplots with explicit layout
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * len(filtered_columns)))


# Loop through each feature and create KDE plots for each segment
for i, feature in enumerate(filtered_columns):
    row, col = divmod(i, n_cols)  # Calculate row and column for current subplot
    for segment in segments:
        sns.kdeplot(
            cohort_users[cohort_users['perk'] == segment][feature],
            label=segment,
            ax=axes[row, col]
        )
    
    axes[row, col].set_title(f'Distribution of {feature} Within Each Segment')
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Density')
    axes[row, col].legend(title='Segment')


# Hide any remaining empty subplots
for i in range(len(filtered_columns), n_rows * n_cols):
    row, col = divmod(i, n_cols)  # Calculate row and column for current subplot
    fig.delaxes(axes[row, col])

plt.tight_layout()
plt.show()
In [42]:
# Calculate the number of rows for the subplot grid (one row per feature)
n_rows = len(features.columns)

# Create subplots with explicit layout, one subplot per row
fig, axes = plt.subplots(n_rows, 1, figsize=(12, 4 * len(features.columns)))

# Loop through each feature to create box plots for each segment
for i, feature in enumerate(features.columns):
    sns.boxplot(y='perk', x=feature, data=cohort_users, ax=axes[i])
    
    axes[i].set_title(f'Box Plot of {feature} Across Each Perk')
    axes[i].set_xlabel('Feature Values')
    axes[i].set_ylabel('Perk')

plt.tight_layout()
plt.show()
In [43]:
from sklearn.metrics.pairwise import euclidean_distances

# Make a copy of the DataFrame slice
features_copy = features.copy()

# Add the cluster labels to the DataFrame containing the features
features_copy['cluster_label'] = cohort_users['cluster_label']
features_copy['perk'] = cohort_users['perk']

# Initialize an empty column to store the distances to the cluster centroid
features_copy['dist_center'] = None

# Loop through each cluster to calculate the distance of each point to its cluster's centroid
for cluster in model_centroids.index:
    cluster_sample = features_copy.loc[features_copy['cluster_label'] == cluster]
    
    features_copy.loc[features_copy['cluster_label'] == cluster, 'dist_center'] = euclidean_distances(
        cluster_sample.iloc[:, :-3],  # Exclude the last three columns: 'cluster_label', 'perk', 'dist_center'
        model_centroids.loc[cluster, :].values.reshape(1, -1)
    )
    
# Calculate the average distance of the data points in each cluster to their respective centroid
avg_dist_to_centroid = features_copy.groupby('perk', as_index=False)['dist_center'].mean()
avg_dist_to_centroid
Out[43]:
perk dist_center
0 1-night free hotel with a flight 1.500694
1 Exclusive Discounts 0.331363
2 Free Checked Bag 1.209937
3 Free Hotel Meal 1.024512
4 No Cancellation Fee 2.027350
In [44]:
# Define the function to draw representative examples from each cluster (perk category in this case)
def draw_examples(df, perk, max_dist, n):
    print(f"Examples from cluster {perk} with less than {max_dist} distance to the cluster centroid")
    return df.loc[(df['perk'] == perk) & (df['dist_center'] < max_dist)].sample(n)

# Draw 5 representative examples from the 'No Cancellation Fee' cluster
# Replace 0.4 with the appropriate maximum distance based on your specific dataset
draw_examples(features_copy, perk='Exclusive Discounts', max_dist=0.4, n=5)
Examples from cluster Exclusive Discounts with less than 0.4 distance to the cluster centroid
Out[44]:
scaled_avg_hotel_rooms scaled_avg_checked_bags scaled_cancellation_proportion scaled_conversion_rate scaled_weekend_trip_proportion cluster_label perk dist_center
5494 -2.34 -1.11 -0.26 -1.82 -0.38 1 Exclusive Discounts 0.173181
1407 -2.34 -1.11 -0.26 -1.82 -0.38 1 Exclusive Discounts 0.173181
5771 -2.34 -1.11 -0.26 -1.82 -0.38 1 Exclusive Discounts 0.173181
1224 -2.34 -1.11 -0.26 -1.82 -0.38 1 Exclusive Discounts 0.173181
3225 -2.34 -1.11 -0.26 -1.82 -0.38 1 Exclusive Discounts 0.173181
In [45]:
# Calculate various metrics for each cluster (perk)
cluster_metrics = features_copy.groupby('perk', as_index=False).agg({
    'scaled_conversion_rate': 'mean',
    'scaled_avg_hotel_rooms': 'mean',
    'scaled_avg_checked_bags': 'mean',
    'scaled_cancellation_proportion': 'mean',
    'scaled_weekend_trip_proportion': 'mean'
})

# Rename columns for clarity
cluster_metrics.columns = ['Perk', 'Avg Conversion Rate', 'Avg Hotel Rooms', 'Avg Checked Bags', 
                           'Avg Cancellation Proportion', 'Avg Weekend Trip Proportion']

# Display the calculated metrics as a table
cluster_metrics
Out[45]:
Perk Avg Conversion Rate Avg Hotel Rooms Avg Checked Bags Avg Cancellation Proportion Avg Weekend Trip Proportion
0 1-night free hotel with a flight 0.628589 0.215061 0.108494 -0.238060 2.301248
1 Exclusive Discounts -1.723496 -2.340000 -0.966199 -0.260000 -0.380000
2 Free Checked Bag 0.398117 0.203140 0.718430 -0.252331 -0.373519
3 Free Hotel Meal -0.372912 0.258524 -0.790872 -0.260000 -0.380000
4 No Cancellation Fee 0.321010 0.171943 0.342979 3.462332 0.004534
In [58]:
# Count the number of customers in each segment
segment_count = cohort_users['perk'].value_counts().reset_index()
segment_count.columns = ['Segment', 'Number of Customers']

# Plotting the bar chart
plt.figure(figsize=(12, 6))
sns.barplot(x='Number of Customers', y='Segment', data=segment_count, palette='cool')
plt.title('Distribution of Customers Across Different Segments')
plt.xlabel('Number of Customers')
plt.ylabel('Segment')

# Annotate bars with count values
for index, value in enumerate(segment_count['Number of Customers']):
    plt.text(value, index, str(value), ha='center', va='center', fontsize=10, color='black')

#plt.tight_layout()
plt.show()
In [63]:
# to use it in the presentation file:

def func(pct, allvalues): 
    absolute = int(pct/100.*np.sum(allvalues))
    return "{:.1f}%\n({:d} users)".format(pct, absolute)

# Data to plot
sizes = segment_count['Number of Customers']
labels = segment_count['Segment']
colors = sns.color_palette('cool', len(labels))

# Plotting the Pie chart
plt.figure(figsize=(7,7))
plt.pie(sizes, labels=labels, colors=colors, autopct=lambda pct: func(pct, sizes), startangle=140, pctdistance=0.85)

# Drawing center circle for 'Donut' style
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Use the y parameter to adjust the position of the title
plt.title('Distribution of Customers Across Different Segments', y=1.08)

# Equal aspect ratio ensures that pie is drawn as a circle
plt.axis('equal')  
plt.tight_layout()
plt.show()
In [62]:
# Calculate the average age for each segment
avg_age_segment = cohort_users.groupby('perk')['age'].mean().reset_index()
avg_age_segment.columns = ['Segment', 'Average Age']

# Plotting the bar chart for average age
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Average Age', y='Segment', data=avg_age_segment, palette='cool')

# Annotate the average age of each segment
for index, value in enumerate(avg_age_segment['Average Age']):
    ax.text(value - 0.5, index, str(round(value, 2)))

plt.title('Average Age of Customers Across Different Segments')
plt.xlabel('Average Age')
plt.ylabel('Segment')
plt.show()
In [68]:
# Gender analysis: drawing the countplot for gender distribution across different perks using
plt.figure(figsize=(12, 6))
sns.countplot(data=cohort_users, x='perk', hue='gender', palette="cool")
plt.title('Gender Distribution by Perk')
plt.xlabel('Perk')
plt.ylabel('Number of Customers')
plt.xticks(rotation=15)
plt.show()
In [67]:
# Plotting the distribution of customers across top 5 cities and their most preferred perk
top_5_cities = cohort_users['home_city'].value_counts().nlargest(5).index.tolist()

# Filter the data to only include the top 5 cities
top_5_cities_data = cohort_users[cohort_users['home_city'].isin(top_5_cities)]

# Generate a horizontal bar chart for the top 5 cities
plt.figure(figsize=(15, 8))
sns.countplot(data=top_5_cities_data, y='home_city', hue='perk', palette="cool")
plt.title('Top 5 Cities: Distribution of Customers by Preferred Perk')
plt.xlabel('Number of Customers')
plt.ylabel('City')
plt.legend(title='Preferred Perk', bbox_to_anchor=(1, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [84]:
final_segmentation_df = cohort_users[['user_id','perk']]

# Save the DataFrame to a CSV file
final_segmentation_df.to_csv('TravelTide_Customer_Segmentation_Kmean.csv', index=False)

# Display the first few rows of the final DataFrame
final_segmentation_df.head()
Out[84]:
user_id perk
0 23557 No Cancellation Fee
1 94883 Free Checked Bag
3 101961 1-night free hotel with a flight
9 149058 No Cancellation Fee
10 152583 No Cancellation Fee
In [ ]: